Number of features and counts and percentage of mitochondrial and ribosomal RNA are plotted below. Red lines mark limits for filtering.
alldata <- PercentageFeatureSet(alldata, "^mt-", col.name = "percent_mito")
alldata <- PercentageFeatureSet(alldata, "^Rp[sl]", col.name = "percent_ribo")
alldata <- PercentageFeatureSet(alldata, "^Hb.-", col.name = "percent_hb")
mito_thresh = 10
ribo_thresh = 5
feat_thresh = 400
hb_thresh = 25
feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo", "percent_hb")
glist = VlnPlot(alldata, group.by = "orig.ident", features = feats, combine = FALSE, pt.size = 0)
names(glist) = feats
glist = lapply(glist, function(x){x + NoLegend() +
geom_point(position = position_jitter(), alpha = 0.1, size = 0.1)})
cowplot::plot_grid(align = "v",plotlist = list(glist[["nFeature_RNA"]] + geom_hline(yintercept = feat_thresh, color = "red"),
glist[["nCount_RNA"]],
glist[["percent_mito"]] + geom_hline(yintercept = mito_thresh, color = "red") ,
glist[["percent_ribo"]] + geom_hline(yintercept = ribo_thresh, color = "red")),
glist[["percent_hb"]] + geom_hline(yintercept = hb_thresh, color = "red"))
FeatureScatter(alldata, "nCount_RNA", "nFeature_RNA", group.by = "orig.ident", pt.size = 0.5,
shuffle = TRUE) + geom_hline(yintercept = feat_thresh)
FeatureScatter(alldata, "percent_ribo", "percent_mito", group.by = "orig.ident", pt.size = 0.5,
shuffle = TRUE) + geom_hline(yintercept = mito_thresh) +
geom_hline(yintercept = mito_thresh) +
geom_vline(xintercept = ribo_thresh)
FeatureScatter(subset(alldata, idents = c("GW_EPI_d0", "STD_EPI_d0")),
"percent_ribo", "percent_mito", group.by = "orig.ident",
pt.size = 0.5, shuffle = TRUE, raster = TRUE) +
geom_hline(yintercept = mito_thresh) +
geom_vline(xintercept = ribo_thresh)
The same QC as above are plotted below, only including cells and genes that pass filtering limits (gene limit: > 3 copies expressed). The top expressed genes are also shown. Mitochondrial and ribosomal genes are filtered out of the data.
selected_c <- WhichCells(alldata, expression = nFeature_RNA > feat_thresh)
selected_f <- rownames(alldata)[Matrix::rowSums(alldata) > 3]
data.filt <- subset(alldata, features = selected_f, cells = selected_c)
selected_mito <- WhichCells(data.filt, expression = percent_mito < 10)
selected_ribo <- WhichCells(data.filt, expression = percent_ribo > 5)
selected_hb <- WhichCells(data.filt, expression = percent_hb < 25)
# and subset the object to only keep those cells
data.filt <- subset(data.filt, cells = intersect(selected_hb,intersect(selected_mito, selected_ribo)))
feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo")
VlnPlot(data.filt, group.by = "orig.ident", features = feats, pt.size = 0.1, ncol = 2) +
NoLegend()
VlnPlot(data.filt, group.by = "orig.ident", features = feats, pt.size = 0, ncol = 2) +
NoLegend()
par(mar = c(4, 8, 2, 1))
C <- data.filt@assays$RNA@counts
C <- Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100
most_expressed <- order(apply(C, 1, median), decreasing = T)[20:1]
# potentials = list()
# for(i in seq(1,dim(C)[2],20000)){
# potentials[[length(potentials)+1]] = rownames(C)[order(apply(C[,i:min(i+19999, dim(C)[2])], 1, median), decreasing = T)[1000:1]]
# }
# length(unique(unlist(potentials)))
#
#
# most_expressed <- order(apply(C[unique(unlist(potentials)),], 1, median), decreasing = T)[20:1]
boxplot(as.matrix(t(C[most_expressed, ])), cex = 0.1, las = 1, xlab = "% total count per cell",
col = (scales::hue_pal())(20)[20:1], horizontal = TRUE)
# Filter Malat1, Gm42418
data.filt <- data.filt[!grepl("Malat1", rownames(data.filt)), ]
data.filt <- data.filt[!grepl("Gm42418", rownames(data.filt)), ]
# Filter Mitocondrial
data.filt <- data.filt[!grepl("^mt-", rownames(data.filt)), ]
# Filter Ribossomal gene (optional if that is a problem on your data) data.filt
data.filt <- data.filt[ ! grepl('^Rp[sl]', rownames(data.filt)), ]
# Filter Hemoglobin gene (optional if that is a problem on your data)
data.filt <- data.filt[!grepl("^Hb.-", rownames(data.filt)), ]
# saveRDS(data.filt, "../analysis/data_pre_cellcycle.rds")
The S-phase and G2M-phase scores are calculated based on the cell cycle gene lists defined in the Seurat package (translated to mouse genes using biomaRt).
# Before running CellCycleScoring the data need to be normalized and
# logtransformed.
data.filt = NormalizeData(data.filt)
data.filt <- CellCycleScoring(object = data.filt, g2m.features = g2mFeat,
s.features = sFeat)
VlnPlot(data.filt, features = c("S.Score", "G2M.Score"), group.by = "orig.ident",
ncol = 3, pt.size = 0.1)
Doublets are identified and excluded using the DoubletFinder package.
if(file.exists(( "../analysis/filtered_data_presweep.rds"))){
data.filt = readRDS( "../analysis/filtered_data_presweep.rds")
}else{
data.filt = FindVariableFeatures(data.filt, verbose = F)
data.filt = ScaleData(data.filt, vars.to.regress = c("nFeature_RNA", "percent_mito"),
verbose = F)
data.filt = RunPCA(data.filt, verbose = F, npcs = 20)
data.filt = RunUMAP(data.filt, dims = 1:10, verbose = F)
saveRDS(data.filt, "../analysis/filtered_data_presweep.rds")
}
if(file.exists("../analysis/sweeplist.RData")){
load("../analysis/sweeplist.RData")
}else{
sweeplist=list()
for(i in unique(data.filt$orig.ident)){
data.filt.subset=subset(data.filt, idents = i)
sweep.res <- paramSweep_v3(data.filt.subset)#, num.cores = 3)
sweep.stats <- summarizeSweep(sweep.res, GT = FALSE)
bcmvn <- find.pK(sweep.stats)
sweeplist[[i]] = bcmvn
}
save(sweeplist, file = "../analysis/sweeplist.RData")
}
if(file.exists("../analysis/singletList.RData")){
load("../analysis/singletList.RData")
}else{
singletList = list()
for(i in unique(data.filt$orig.ident)){
data.filt.subset=subset(data.filt, idents = i)
nExp <- round(ncol(data.filt.subset) * 0.08) # expect 8% doublets
mypK = as.numeric(as.character(sweeplist[[i]]$pK[which.max(sweeplist[[i]]$BCmetric)]))
data.filt.subset <- doubletFinder_v3(data.filt.subset, pN = 0.25, pK = mypK, nExp = nExp, PCs = 1:10)
DF.name = colnames(data.filt.subset@meta.data)[grepl("DF.classification", colnames(data.filt.subset@meta.data))]
singletList[[i]] = data.filt.subset@meta.data[, DF.name]
names(singletList[[i]]) = colnames(data.filt.subset)
}
save(singletList, file = "../analysis/singletList.RData")
}
# sweep.res <- paramSweep_v3(data.filt)#, num.cores = 3)
# sweep.stats <- summarizeSweep(sweep.res, GT = FALSE)
# bcmvn <- find.pK(sweep.stats)
# par(mar = c(4, 4, 2, 2))
# barplot(bcmvn$BCmetric, names.arg =bcmvn$pK, las=2)
#data.filt@meta.data$DF = unlist(singletList)[match(colnames(data.filt), gsub(".*\\.","",names(unlist(singletList))))]
data.filt = AddMetaData(data.filt, unlist(singletList)[match(colnames(data.filt), gsub(".*\\.","",names(unlist(singletList))))], "DF")
cowplot::plot_grid(ncol = 2, DimPlot(data.filt, group.by = "orig.ident") + NoAxes(),
DimPlot(data.filt, group.by = "DF") + NoAxes())
DimPlot(data.filt, group.by = "DF", split.by = "orig.ident", ncol = 4) + NoAxes()
VlnPlot(data.filt, features = "nFeature_RNA", group.by = "DF",pt.size = 0.1)
VlnPlot(data.filt, features = "nFeature_RNA", split.by = "DF", group.by = "orig.ident", pt.size = 0.1)
data.filt = data.filt[, data.filt@meta.data[, "DF"] == "Singlet"]
cellfreq = as.data.frame(table(data.filt$orig.ident), stringsAsFactors = FALSE)
cellfreq$Day = sapply(cellfreq$Var1, function(x){strsplit(x, "_")[[1]][3]})
cellfreq$Type = sapply(cellfreq$Var1, function(x){strsplit(x, "_")[[1]][2]})
cellfreq$Treatment = sapply(cellfreq$Var1, function(x){strsplit(x, "_")[[1]][1]})
#png("/Volumes/tank/Private/Meetings/Lab meetings/MyLabMeetings/20210621/FilteredCellNumbers.png", res = 240, width = 1200, height = 800)
ggplot(cellfreq, aes(x = paste(Day, Treatment), y = Freq, color = Type)) + geom_point() +
theme(panel.background = element_blank(), legend.key = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "Day and treatment", y = "Number of cells") +
ylim(0,NA)
#dev.off()
Final cell counts:
| Var1 | Freq |
|---|---|
| GW_DN_d0 | 13825 |
| GW_EPI_d0 | 2927 |
| GW_IMM_d0 | 15715 |
| STD_DN_d0 | 14575 |
| STD_EPI_d0 | 5085 |
| STD_IMM_d0 | 15762 |
saveRDS(data.filt, file = "../analysis/filtered_data.rds")
sessionInfo()
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Users/jenfra/miniconda3/envs/singlecell/lib/libopenblasp-r0.3.15.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] DoubletFinder_2.0.3 Matrix_1.4-1 gplots_3.1.1 rafalib_1.0.0 enrichR_3.0 pheatmap_1.0.12 ggplot2_3.3.6
## [8] cowplot_1.1.1 dplyr_1.0.9 venn_1.11 sp_1.5-0 SeuratObject_4.1.0 Seurat_4.1.1
##
## loaded via a namespace (and not attached):
## [1] plyr_1.8.7 igraph_1.3.4 lazyeval_0.2.2 splines_4.0.5 listenv_0.8.0 scattermore_0.8 digest_0.6.29
## [8] htmltools_0.5.3 rsconnect_0.8.25 fansi_1.0.3 magrittr_2.0.3 tensor_1.5 cluster_2.1.2 ROCR_1.0-11
## [15] limma_3.46.0 globals_0.16.0 matrixStats_0.62.0 spatstat.sparse_2.1-1 colorspace_2.0-3 ggrepel_0.9.1 xfun_0.31
## [22] crayon_1.5.1 jsonlite_1.8.0 progressr_0.10.0 spatstat.data_2.2-0 survival_3.3-1 zoo_1.8-9 glue_1.6.2
## [29] polyclip_1.10-0 gtable_0.3.0 leiden_0.3.9 future.apply_1.8.1 abind_1.4-5 scales_1.1.1 DBI_1.1.3
## [36] spatstat.random_2.2-0 miniUI_0.1.1.1 Rcpp_1.0.9 viridisLite_0.4.0 xtable_1.8-4 reticulate_1.25 spatstat.core_2.4-4
## [43] bit_4.0.4 htmlwidgets_1.5.4 httr_1.4.2 RColorBrewer_1.1-2 ellipsis_0.3.2 ica_1.0-2 pkgconfig_2.0.3
## [50] farver_2.1.0 sass_0.4.0 uwot_0.1.11 deldir_1.0-6 utf8_1.2.2 tidyselect_1.1.2 labeling_0.4.2
## [57] rlang_1.0.4 reshape2_1.4.4 later_1.3.0 munsell_0.5.0 tools_4.0.5 cachem_1.0.6 cli_3.3.0
## [64] generics_0.1.3 ggridges_0.5.3 evaluate_0.15 stringr_1.4.0 fastmap_1.1.0 yaml_2.3.5 goftest_1.2-3
## [71] knitr_1.37 bit64_4.0.5 fitdistrplus_1.1-8 caTools_1.18.2 admisc_0.29 purrr_0.3.4 RANN_2.6.1
## [78] pbapply_1.5-0 future_1.27.0 nlme_3.1-155 mime_0.12 hdf5r_1.3.5 compiler_4.0.5 rstudioapi_0.13
## [85] plotly_4.10.0 curl_4.3.2 png_0.1-7 spatstat.utils_2.3-1 tibble_3.1.8 bslib_0.4.0 stringi_1.7.8
## [92] highr_0.9 rgeos_0.5-9 lattice_0.20-45 vctrs_0.4.1 pillar_1.8.0 lifecycle_1.0.1 spatstat.geom_2.4-0
## [99] lmtest_0.9-40 jquerylib_0.1.4 RcppAnnoy_0.0.19 data.table_1.14.2 bitops_1.0-7 irlba_2.3.5 httpuv_1.6.5
## [106] patchwork_1.1.1 R6_2.5.1 bookdown_0.24 promises_1.2.0.1 KernSmooth_2.23-20 gridExtra_2.3 parallelly_1.32.1
## [113] codetools_0.2-18 MASS_7.3-55 gtools_3.9.2 assertthat_0.2.1 rjson_0.2.21 withr_2.5.0 sctransform_0.3.3
## [120] mgcv_1.8-39 parallel_4.0.5 grid_4.0.5 rpart_4.1.16 tidyr_1.2.0 rmarkdown_2.13 Rtsne_0.15
## [127] shiny_1.7.2